###This file is to generate bond-level transaction cost measures for TRACE regular
###Need to extraly merge with bloomberg bond fundamentals

###And this code also combine regular and EOD bond-level data and run regressions.
library(stringr)
library(data.table)
library(lubridate)
library(tictoc)
library(dplyr)
library(dtplyr)
library(zoo)
library(openxlsx)
library(taRifx)
library(haven)
library(sandwich)
library(lmtest)
library(sqldf)
library(ggplot2)
library(stringr)
library(data.table)
library(lubridate)
library(tictoc)
library(dplyr)
library(dtplyr)
library(zoo)
library(stats) 
library(gtools)

rm(list=ls())
options(max.print=999999)



# Sets the current path
current_path <- "XXX"
setwd(current_path)

#setwd("~/Corporate_Bond_TRACE/data/TRACE_standard")

#"Additional_bond_attributes.csv" is bond domicile ("COUNTRY_DOMICILE") information from Mergent FISD-Bond Issue data
Bond_domicile<-fread(paste0(current_path,"/Additional_bond_attributes.csv"))

TradeData_filter_masterfundamental<-
  fread(paste0(current_path,"/Data_FilterMerge_standard_TRACE.csv"))

print(min(TradeData_filter_masterfundamental$trd_exctn_dt_s))
print(max(TradeData_filter_masterfundamental$trd_exctn_dt_s))
nrow(TradeData_filter_masterfundamental)

###NOTE: after obtaining more offering dates, need to firstly filter for time_to_offering, before matching CDC.

print(nrow(TradeData_filter_masterfundamental))
TradeData_filter_masterfundamental<-
  TradeData_filter_masterfundamental[(time_to_offering>=90)|(is.na(time_to_offering))]
print(nrow(TradeData_filter_masterfundamental))

#################################################################################################
###Part-1: process data

#################################################################################################
# distribution of price and quantity #
#################################################################################################
print(summary(TradeData_filter_masterfundamental$rptd_pr))
print(summary(TradeData_filter_masterfundamental$`Quantity(ParVal)_num`))
table(TradeData_filter_masterfundamental[is.na(`Quantity(ParVal)_num`),
                                         c("Quantity(ParVal)"),
                                         with=FALSE])
l01pt_price<-quantile(as.matrix(TradeData_filter_masterfundamental$rptd_pr),probs = 0.001,na.rm = TRUE)
h01pt_price<-quantile(as.matrix(TradeData_filter_masterfundamental$rptd_pr),probs = 0.999,na.rm = TRUE)
l01pt_vol<-quantile(as.matrix(TradeData_filter_masterfundamental$`Quantity(ParVal)_num`),probs = 0.001,na.rm = TRUE)
h01pt_vol<-quantile(as.matrix(TradeData_filter_masterfundamental$`Quantity(ParVal)_num`),probs = 0.999,na.rm = TRUE)

TradeData_filter_masterfundamental[rptd_pr<l01pt_price,`:=`(rptd_pr=l01pt_price)]
TradeData_filter_masterfundamental[rptd_pr>h01pt_price,`:=`(rptd_pr=h01pt_price)]
TradeData_filter_masterfundamental[`Quantity(ParVal)_num`<l01pt_vol,`:=`(`Quantity(ParVal)_num`=l01pt_vol)]

#################################################################################################
# dummies for interdealer, customer, dealer-sell-to-customer and customer-sell-to-dealer trades #
#################################################################################################
TradeData_filter_masterfundamental[,`:=`(dum_interdealer=0, dum_customer=0, dum_dealer_to_customer=0, dum_customer_to_dealer=0)]
TradeData_filter_masterfundamental[trade_type=="D2D", dum_interdealer:=1]
TradeData_filter_masterfundamental[trade_type=="C2D" | trade_type=="D2C" , dum_customer:=1]
TradeData_filter_masterfundamental[trade_type=="D2C", 
                                   dum_dealer_to_customer:=1]
TradeData_filter_masterfundamental[trade_type=="C2D", 
                                   dum_customer_to_dealer:=1]

#################################################################################################
# execution time #
#################################################################################################
# TradeData_filter_masterfundamental[,datetime_exctn := 
#                    paste0(trd_exctn_dt,"T",str_pad(`Execution Time`,width=6,side="left",pad="0"),
#                           "Z")]
TradeData_filter_masterfundamental[,datetime_exctn := ymd_hms(datetime_exctn)]

TradeData_filter_masterfundamental[,`:=`(datetime_exctn=as_datetime(datetime_exctn))]
TradeData_filter_masterfundamental[,`:=`(new_datetime_exctn=as_datetime(datetime_exctn))]

merge_index<-c("cusip_id","bond_sym_id","company_symbol")
setcolorder(TradeData_filter_masterfundamental,
            c(merge_index,"trd_exctn_dt_s",
              "datetime_exctn","new_datetime_exctn"))

#################################################################################################
# time_to_maturity with 5 years cutoff #
#################################################################################################
###this step may also drop fundamentals=NA obs, mostly may be new issuance.
TradeData_filter_masterfundamental<-
  TradeData_filter_masterfundamental[time_to_maturity>=-10]
print(nrow(TradeData_filter_masterfundamental))
TradeData_filter_masterfundamental[,`:=`(dum_TTM5yless=0,
                                         dum_TTM5ymore=0)]
TradeData_filter_masterfundamental[time_to_maturity<=1825,
                                   `:=`(dum_TTM5yless=1)]
TradeData_filter_masterfundamental[time_to_maturity>1825,
                                   `:=`(dum_TTM5ymore=1)]

#################################################################################################
# bond grade #
#################################################################################################

###$$$###
TradeData_filter_masterfundamental[,`:=`(bond_grade=mean_grade)]
#TradeData_filter_masterfundamental[,`:=`(bond_grade=new_bond_grade4)]
#TradeData_filter_masterfundamental[,`:=`(bond_grade=new_bond_grade3)]
#TradeData_filter_masterfundamental[,`:=`(bond_grade=new_bond_grade2)]
#TradeData_filter_masterfundamental[,`:=`(bond_grade=new_bond_grade)]
#TradeData_filter_masterfundamental[,`:=`(bond_grade=mean_grade)]
print(table(TradeData_filter_masterfundamental$bond_grade))
print(message(paste0("number of IG bonds is ",nrow(unique(as.matrix(TradeData_filter_masterfundamental[bond_grade=="I",c("cusip_id"),with=FALSE]))))))
print(message(paste0("number of HY bonds is ",nrow(unique(as.matrix(TradeData_filter_masterfundamental[bond_grade=="H",c("cusip_id"),with=FALSE]))))))
print(message(paste0("number of NR bonds is ",nrow(unique(as.matrix(TradeData_filter_masterfundamental[bond_grade=="NR",c("cusip_id"),with=FALSE]))))))

TradeData_filter_masterfundamental[
  ,
  `:=`(credit_rating_code_RD=mean_rating_code)
]

#################################################################################################
# industry and US/nonUS #
#################################################################################################
# TradeData_filter_masterfundamental<-
#   left_join(TradeData_filter_masterfundamental,Bond_domicile_new,by=c("issuer_cusip"))

TradeData_filter_masterfundamental<-as.data.table(TradeData_filter_masterfundamental)

TradeData_filter_masterfundamental[,`:=`(INDUSTRY="",
                                         US_indicator="")]

TradeData_filter_masterfundamental[industry_code>=20&industry_code<=26,
                                   `:=`(INDUSTRY="Finance")]
TradeData_filter_masterfundamental[industry_code==12,
                                   `:=`(INDUSTRY="Oil&Gas")]
TradeData_filter_masterfundamental[industry_code==16,
                                   `:=`(INDUSTRY="Transportation")]
TradeData_filter_masterfundamental[industry_code==30|industry_code==31|industry_code==33,
                                   `:=`(INDUSTRY="Utility")]

print(table(TradeData_filter_masterfundamental$INDUSTRY))


Bond_domicile[,`:=`(count_seq=sequence(.N)),by=c("issuer_cusip")]
Bond_domicile<-Bond_domicile[count_seq==1]
Bond_domicile_new<-Bond_domicile[,c("issuer_cusip","country_domicile","Industry"),with=FALSE]
setnames(TradeData_filter_masterfundamental,c("issuer_cusip.x"),c("issuer_cusip"))
TradeData_filter_masterfundamental<-left_join(TradeData_filter_masterfundamental,Bond_domicile_new,by=c("issuer_cusip"))

TradeData_filter_masterfundamental<-as.data.table(TradeData_filter_masterfundamental)
print(message(paste0("prop of missing country_domicile is ",nrow(TradeData_filter_masterfundamental[is.na(country_domicile)])/
                       nrow(TradeData_filter_masterfundamental))))

TradeData_filter_masterfundamental[country_domicile=="USA",
                                   `:=`(US_indicator="US")]
TradeData_filter_masterfundamental[country_domicile!="USA"&(!is.na(country_domicile)),
                                   `:=`(US_indicator="non-US")]

print(table(TradeData_filter_masterfundamental$INDUSTRY))
print(table(TradeData_filter_masterfundamental$Industry))
print(table(TradeData_filter_masterfundamental$US_indicator))

TradeData_filter_masterfundamental[,`:=`(dum_US=0,
                                         dum_nonUS=0,
                                         dum_Finance=0,
                                         dum_OilGas=0,dum_Transportation=0,dum_Utility=0)]

TradeData_filter_masterfundamental[INDUSTRY=="Finance",`:=`(dum_Finance=1)]
TradeData_filter_masterfundamental[INDUSTRY=="Oil&Gas",`:=`(dum_OilGas=1)]
TradeData_filter_masterfundamental[INDUSTRY=="Transportation",`:=`(dum_Transportation=1)]
TradeData_filter_masterfundamental[INDUSTRY=="Utility",`:=`(dum_Utility=1)]
TradeData_filter_masterfundamental[US_indicator=="US",`:=`(dum_US=1)]
TradeData_filter_masterfundamental[US_indicator=="non-US",`:=`(dum_nonUS=1)]

###Define other dummies
###(1) trading volume cutoffs: 10,000, 1,000,000 and 5,000,000
vol_cutoff1<-100000
vol_cutoff2<-1000000
vol_cutoff3<-5000000
summary(TradeData_filter_masterfundamental$`Quantity(ParVal)_num`)
table(TradeData_filter_masterfundamental[is.na(`Quantity(ParVal)_num`),c("Quantity(ParVal)"),with=FALSE])
TradeData_filter_masterfundamental[
  (`Quantity(ParVal)`=="1MM+"),
  `:=`(`Quantity(ParVal)_num`=1000000)
  ]
TradeData_filter_masterfundamental[
  (`Quantity(ParVal)`=="5MM+"),
  `:=`(`Quantity(ParVal)_num`=5000000)
  ]
summary(TradeData_filter_masterfundamental$`Quantity(ParVal)_num`)
TradeData_filter_masterfundamental[
  ,
  `:=`(category_trade_size="")
  ]
TradeData_filter_masterfundamental[
  (`Quantity(ParVal)_num`<vol_cutoff1),
  `:=`(category_trade_size="Micro")
  ]
TradeData_filter_masterfundamental[
  (`Quantity(ParVal)_num`>=vol_cutoff1)&
    (`Quantity(ParVal)_num`<vol_cutoff2),
  `:=`(category_trade_size="Odd")
  ]
TradeData_filter_masterfundamental[
  (`Quantity(ParVal)_num`>=vol_cutoff2)&
    (`Quantity(ParVal)_num`<vol_cutoff3),
  `:=`(category_trade_size="Round")
  ]
TradeData_filter_masterfundamental[
  (`Quantity(ParVal)_num`>=vol_cutoff3),
  `:=`(category_trade_size="Block")
  ]
table(TradeData_filter_masterfundamental$category_trade_size)
###(2)crisis and intervention
TradeData_filter_masterfundamental[
  ,
  `:=`(dum_crisis=0,
       dum_intervention=0)
  ]
TradeData_filter_masterfundamental[
  (trd_exctn_dt>=20200305)&
    (trd_exctn_dt<=20200319),
  `:=`(dum_crisis=1)
]
TradeData_filter_masterfundamental[
  (trd_exctn_dt>20200319),
  `:=`(dum_intervention=1)
  ]
table(TradeData_filter_masterfundamental[dum_crisis==1,c("trd_exctn_dt_s"),with=FALSE])
table(TradeData_filter_masterfundamental[dum_intervention==1,c("trd_exctn_dt_s"),with=FALSE])

###(3)rename existinng bond characteristics
TradeData_filter_masterfundamental[
  ,
  `:=`(bond_age_yr=time_to_offering/365,
       time_to_maturity_yr=time_to_maturity/365)
  ]
summary(TradeData_filter_masterfundamental$bond_age_yr)
summary(TradeData_filter_masterfundamental$time_to_maturity_yr)

ExtraDum<-c("dum_US","dum_nonUS","dum_Finance","dum_OilGas","dum_Transportation","dum_Utility",
            "dum_TTM5ymore","dum_TTM5yless")
###CHANGE here to look at different dimensions.

setnames(TradeData_filter_masterfundamental,
         c("Quantity(ParVal)","Quantity(ParVal)_num"),
         c("Quantity","Quantity_num"))

#firstly look at all C2D, D2D and D2C, then specifically look at REMAINING C2D_RC and D2C_RC
TradeData_filter_masterfundamental[,ID:=.I]

TMP_C2D <- TradeData_filter_masterfundamental[(dum_customer_to_dealer==1),
                                              c("cusip_id","trd_exctn_dt","new_datetime_exctn","rptd_pr","Quantity_num","bond_grade","Quantity",
                                                "trade_type","remuneration","ID",ExtraDum),
                                              with=FALSE]

TMP_D2D <- TradeData_filter_masterfundamental[dum_interdealer==1,c("cusip_id","trd_exctn_dt","new_datetime_exctn","rptd_pr","Quantity_num","bond_grade","Quantity",
                                                                   "trade_type","remuneration","ID",ExtraDum),
                                              with=FALSE]

TMP_D2C <- TradeData_filter_masterfundamental[(dum_dealer_to_customer==1),c("cusip_id","trd_exctn_dt","new_datetime_exctn","rptd_pr","Quantity_num","bond_grade","Quantity",
                                                                            "trade_type","remuneration","ID",ExtraDum),
                                              with=FALSE]

setnames(TMP_C2D,c("ID"),c("C2D_ID"))
setnames(TMP_D2D,c("ID"),c("D2D_ID"))
setnames(TMP_D2C,c("ID"),c("D2C_ID"))

print(nrow(TMP_C2D)+nrow(TMP_D2D)+nrow(TMP_D2C))
print(nrow(TradeData_filter_masterfundamental))

setnames(TMP_D2C,
         c("cusip_id","trd_exctn_dt","new_datetime_exctn","rptd_pr","Quantity_num","bond_grade","Quantity",
           ExtraDum),
         paste0(c("cusip_id","trd_exctn_dt","new_datetime_exctn","rptd_pr","Quantity_num","bond_grade","Quantity",
                  ExtraDum),"_D2C"))

###update 0403, only change standard to be within 15 min. 
#round-1:TMP_C2D and TMP_D2C
###CHECKED timediff is in seconds
TMP_Agency_1 <- data.table(sqldf('select *,
                                 b.new_datetime_exctn_D2C-a.new_datetime_exctn as timediff
                                 from TMP_C2D as a, TMP_D2C as b
                                 where a.cusip_id=b.cusip_id_D2C and a.trd_exctn_dt=b.trd_exctn_dt_D2C and a.Quantity=b.Quantity_D2C and timediff<=900 and timediff>=0
                                 order by C2D_ID, timediff'))
nrow(TMP_Agency_1)
TMP_Agency_1[,`:=`(count_seq=sequence(.N)),
             by=c("C2D_ID")]
TMP_Agency_1<-TMP_Agency_1[count_seq==1]
nrow(TMP_Agency_1)

match_TMP_C2D<-TMP_C2D[C2D_ID %in% as.vector(TMP_Agency_1$C2D_ID)]
unmatch_TMP_C2D<-TMP_C2D[!(C2D_ID %in% as.vector(TMP_Agency_1$C2D_ID))]
match_TMP_D2C<-TMP_D2C[D2C_ID %in% as.vector(TMP_Agency_1$D2C_ID)]
unmatch_TMP_D2C<-TMP_D2C[!(D2C_ID %in% as.vector(TMP_Agency_1$D2C_ID))]

###calculate daily_irc_byvol
TMP_Agency_1[,`:=`(max_prc_pairwise=pmax(rptd_pr, rptd_pr_D2C, na.rm = TRUE),
                   min_prc_pairwise=pmin(rptd_pr, rptd_pr_D2C, na.rm = TRUE))]

TMP_Agency_1[,daily_irc_byvol:=(max_prc_pairwise-min_prc_pairwise)/max_prc_pairwise,
             by=.(trd_exctn_dt,cusip_id,`Quantity_num`)]

# winsorized at the 1% level each day
TMP_Agency_1[,
             `:=`(high1pt_daily_irc_byvol=quantile(daily_irc_byvol,probs = 0.99,na.rm = TRUE),
                  low1pt_daily_irc_byvol=quantile(daily_irc_byvol,probs = 0.01,na.rm = TRUE)),
             by=c("trd_exctn_dt")]

TMP_Agency_1[daily_irc_byvol>high1pt_daily_irc_byvol,
             `:=`(daily_irc_byvol=high1pt_daily_irc_byvol)]
TMP_Agency_1[daily_irc_byvol<low1pt_daily_irc_byvol,
             `:=`(daily_irc_byvol=low1pt_daily_irc_byvol)]

low1pt_allsample_Quantity_num<-quantile(TMP_Agency_1$`Quantity_num`,0.01,na.rm = TRUE)

TMP_Agency_1[`Quantity_num`<low1pt_allsample_Quantity_num,
             `:=`(`Quantity_num`=low1pt_allsample_Quantity_num)]

print(summary(TMP_Agency_1$`Quantity_num`))

# fwrite(TMP_Agency_1,paste0("TMP_Agency_1_calculate_new_IRC_0505.csv"))
# fwrite(TMP_Agency_1,paste0("TMP_Agency_1_calculate_new_IRC_0513.csv"))
# fwrite(TMP_Agency_1,paste0("TMP_Agency_1_calculate_new_IRC_0524.csv"))
# fwrite(TMP_Agency_1,paste0("TMP_Agency_1_calculate_new_IRC_0607.csv"))

###transaction-wise MIRC
Transactionwise_MIRC<-
  TMP_Agency_1[,c("C2D_ID","daily_irc_byvol"),with=FALSE]
setnames(Transactionwise_MIRC,
         c("C2D_ID","daily_irc_byvol"),
         c("ID","MIRC"))
Transactionwise_MIRC[
  ,
  `:=`(MIRC=as.numeric(MIRC),
       ID=as.numeric(ID))
  ]
###IMPORTANT: need to later save the agency-capacity data since to calculate CH, we temporarily drop
###agency transactions from TradeData_filter_masterfundmantal
TradeData_filter_masterfundamental<-
  as.data.table(left_join(TradeData_filter_masterfundamental,
                          Transactionwise_MIRC,
                          by=c("ID")))
nrow(TMP_Agency_1)
nrow(TradeData_filter_masterfundamental[!is.na(MIRC)])

TradeData_filter_masterfundamental[,
                                   `:=`(MIRC_bondday=weighted.mean(MIRC, w = `Quantity_num`, 
                                                                   na.rm = TRUE)), by=c("trd_exctn_dt","cusip_id")]
summary(TradeData_filter_masterfundamental$MIRC)
summary(TradeData_filter_masterfundamental$MIRC_bondday)

######################################################################################
#round-2: unmatch_TMP_C2D and TMP_D2D
TMP_Agency_2 <- data.table(sqldf('select a.C2D_ID, b.D2D_ID,
                                 b.new_datetime_exctn-a.new_datetime_exctn as timediff
                                 from unmatch_TMP_C2D as a, TMP_D2D as b
                                 where a.cusip_id=b.cusip_id and a.trd_exctn_dt=b.trd_exctn_dt and a.Quantity=b.Quantity and timediff<=900 and timediff>=0
                                 order by C2D_ID, timediff'))

nrow(TMP_Agency_2)
TMP_Agency_2[,`:=`(count_seq=sequence(.N)),
             by=c("C2D_ID")]
TMP_Agency_2<-TMP_Agency_2[count_seq==1]
nrow(TMP_Agency_2)

match_TMP_C2D<-rbind(match_TMP_C2D,unmatch_TMP_C2D[C2D_ID %in% as.vector(TMP_Agency_2$C2D_ID)])
unmatch_TMP_C2D<-unmatch_TMP_C2D[!(C2D_ID %in% as.vector(TMP_Agency_2$C2D_ID))]
match_TMP_D2D<-TMP_D2D[D2D_ID %in% as.vector(TMP_Agency_2$D2D_ID)]
unmatch_TMP_D2D<-TMP_D2D[!(D2D_ID %in% as.vector(TMP_Agency_2$D2D_ID))]

#round-3: unmatch_TMP_D2C and unmatch_TMP_D2D
setnames(unmatch_TMP_D2C,
         c("cusip_id_D2C","trd_exctn_dt_D2C", "new_datetime_exctn_D2C", "rptd_pr_D2C",  "Quantity_num_D2C", "Quantity_D2C",      
           "bond_grade_D2C",paste0(ExtraDum,"_D2C")),
         c("cusip_id","trd_exctn_dt", "new_datetime_exctn", "rptd_pr",  "Quantity_num", "Quantity",      
           "bond_grade",ExtraDum))
setnames(match_TMP_D2C,
         c("cusip_id_D2C","trd_exctn_dt_D2C", "new_datetime_exctn_D2C", "rptd_pr_D2C",  "Quantity_num_D2C", "Quantity_D2C",     
           "bond_grade_D2C",paste0(ExtraDum,"_D2C")),
         c("cusip_id","trd_exctn_dt", "new_datetime_exctn", "rptd_pr",  "Quantity_num", "Quantity",       
           "bond_grade",ExtraDum))
TMP_Agency_3 <- data.table(sqldf('select a.D2D_ID, b.D2C_ID,
                                 b.new_datetime_exctn-a.new_datetime_exctn as timediff
                                 from unmatch_TMP_D2D as a, unmatch_TMP_D2C as b
                                 where a.cusip_id=b.cusip_id and a.trd_exctn_dt=b.trd_exctn_dt and a.Quantity=b.Quantity and timediff<=900 and timediff>=0
                                 order by D2D_ID, timediff'))
nrow(TMP_Agency_3)
TMP_Agency_3[,`:=`(count_seq=sequence(.N)),
             by=c("D2D_ID")]
TMP_Agency_3<-TMP_Agency_3[count_seq==1]
nrow(TMP_Agency_3)

match_TMP_D2C<-rbind(match_TMP_D2C,unmatch_TMP_D2C[D2C_ID %in% as.vector(TMP_Agency_3$D2C_ID)],fill=TRUE)
unmatch_TMP_D2C<-unmatch_TMP_D2C[!(D2C_ID %in% as.vector(TMP_Agency_3$D2C_ID))]
match_TMP_D2D<-rbind(match_TMP_D2D,unmatch_TMP_D2D[D2D_ID %in% as.vector(TMP_Agency_3$D2D_ID)],fill=TRUE)
unmatch_TMP_D2D<-unmatch_TMP_D2D[!(D2D_ID %in% as.vector(TMP_Agency_3$D2D_ID))]

nrow(match_TMP_C2D)+nrow(unmatch_TMP_C2D)
nrow(TMP_C2D)
nrow(match_TMP_D2C)+nrow(unmatch_TMP_D2C)
nrow(TMP_D2C)
nrow(match_TMP_D2D)+nrow(unmatch_TMP_D2D)
nrow(TMP_D2D)

###for next three rounds, relax par value equality restriction.
#round-4:unmatch_TMP_C2D and unmatch_TMP_D2C
TMP_Agency_4 <- data.table(sqldf('select a.C2D_ID, b.D2C_ID,
                                 b.new_datetime_exctn-a.new_datetime_exctn as timediff
                                 from unmatch_TMP_C2D as a, unmatch_TMP_D2C as b
                                 where a.cusip_id=b.cusip_id and a.trd_exctn_dt=b.trd_exctn_dt and a.Quantity_num>=b.Quantity_num and timediff<=900 and timediff>=0
                                 order by C2D_ID, timediff'))
nrow(TMP_Agency_4)
TMP_Agency_4[,`:=`(count_seq=sequence(.N)),
             by=c("C2D_ID")]
TMP_Agency_4<-TMP_Agency_4[count_seq==1]
nrow(TMP_Agency_4)

match_TMP_C2D<-rbind(match_TMP_C2D,unmatch_TMP_C2D[C2D_ID %in% as.vector(TMP_Agency_4$C2D_ID)])
unmatch_TMP_C2D<-unmatch_TMP_C2D[!(C2D_ID %in% as.vector(TMP_Agency_4$C2D_ID))]
match_TMP_D2C<-rbind(match_TMP_D2C,unmatch_TMP_D2C[D2C_ID %in% as.vector(TMP_Agency_4$D2C_ID)])
unmatch_TMP_D2C<-unmatch_TMP_D2C[!(D2C_ID %in% as.vector(TMP_Agency_4$D2C_ID))]

#round-5: unmatch_TMP_C2D and unmatch_TMP_D2D
TMP_Agency_5 <- data.table(sqldf('select a.C2D_ID, b.D2D_ID,
                                 b.new_datetime_exctn-a.new_datetime_exctn as timediff
                                 from unmatch_TMP_C2D as a, unmatch_TMP_D2D as b
                                 where a.cusip_id=b.cusip_id and a.trd_exctn_dt=b.trd_exctn_dt and a.Quantity_num>=b.Quantity_num and timediff<=900 and timediff>=0
                                 order by C2D_ID, timediff'))

nrow(TMP_Agency_5)
TMP_Agency_5[,`:=`(count_seq=sequence(.N)),
             by=c("C2D_ID")]
TMP_Agency_5<-TMP_Agency_5[count_seq==1]
nrow(TMP_Agency_5)

match_TMP_C2D<-rbind(match_TMP_C2D,unmatch_TMP_C2D[C2D_ID %in% as.vector(TMP_Agency_5$C2D_ID)])
unmatch_TMP_C2D<-unmatch_TMP_C2D[!(C2D_ID %in% as.vector(TMP_Agency_5$C2D_ID))]
match_TMP_D2D<-rbind(match_TMP_D2D,unmatch_TMP_D2D[D2D_ID %in% as.vector(TMP_Agency_5$D2D_ID)])
unmatch_TMP_D2D<-unmatch_TMP_D2D[!(D2D_ID %in% as.vector(TMP_Agency_5$D2D_ID))]

#round-6:unmatch_TMP_D2C and unmatch_TMP_D2D
TMP_Agency_6 <- data.table(sqldf('select a.D2D_ID, b.D2C_ID,
                                 b.new_datetime_exctn-a.new_datetime_exctn as timediff
                                 from unmatch_TMP_D2D as a, unmatch_TMP_D2C as b
                                 where a.cusip_id=b.cusip_id and a.trd_exctn_dt=b.trd_exctn_dt and a.Quantity_num<=b.Quantity_num and timediff<=900 and timediff>=0
                                 order by D2D_ID, timediff'))
nrow(TMP_Agency_6)
TMP_Agency_6[,`:=`(count_seq=sequence(.N)),
             by=c("D2D_ID")]
TMP_Agency_6<-TMP_Agency_6[count_seq==1]
nrow(TMP_Agency_6)

match_TMP_D2C<-rbind(match_TMP_D2C,unmatch_TMP_D2C[D2C_ID %in% as.vector(TMP_Agency_6$D2C_ID)])
unmatch_TMP_D2C<-unmatch_TMP_D2C[!(D2C_ID %in% as.vector(TMP_Agency_6$D2C_ID))]
match_TMP_D2D<-rbind(match_TMP_D2D,unmatch_TMP_D2D[D2D_ID %in% as.vector(TMP_Agency_6$D2D_ID)])
unmatch_TMP_D2D<-unmatch_TMP_D2D[!(D2D_ID %in% as.vector(TMP_Agency_6$D2D_ID))]

nrow(match_TMP_C2D)+nrow(unmatch_TMP_C2D)
nrow(TMP_C2D)
nrow(match_TMP_D2C)+nrow(unmatch_TMP_D2C)
nrow(TMP_D2C)
nrow(match_TMP_D2D)+nrow(unmatch_TMP_D2D)
nrow(TMP_D2D)


match_ID=c(c(match_TMP_C2D$C2D_ID),c(match_TMP_D2D$D2D_ID),c(match_TMP_D2C$D2C_ID))

TradeData_filter_masterfundamental[,`:=`(capacity="")]
TradeData_filter_masterfundamental[(ID %in% match_ID),
                                   `:=`(capacity="A")]
TradeData_filter_masterfundamental[!(ID %in% match_ID),
                                   `:=`(capacity="P")]
print(table(TradeData_filter_masterfundamental$capacity))
print(nrow(match_TMP_C2D)+nrow(match_TMP_D2D)+nrow(match_TMP_D2C))

TradeData_filter_masterfundamental[remuneration=="C",
                                   `:=`(capacity="A")]
table(TradeData_filter_masterfundamental$capacity)


setnames(TradeData_filter_masterfundamental,
         c("Quantity","Quantity_num"),
         c("Quantity(ParVal)","Quantity(ParVal)_num"))

print(nrow(TradeData_filter_masterfundamental))

###Create new data table to store all transactions
TradeData_filter_masterfundamental_allcapacity<-
  TradeData_filter_masterfundamental

print(nrow(TradeData_filter_masterfundamental_allcapacity))
print(table(TradeData_filter_masterfundamental_allcapacity$capacity))

###exclude agency transactions after calculating IRCs
TradeData_filter_masterfundamental_agency<-
  TradeData_filter_masterfundamental[capacity=="A"]

TradeData_filter_masterfundamental<-
  TradeData_filter_masterfundamental[capacity=="P"]

print(nrow(TradeData_filter_masterfundamental))
print(message(paste0("distribution of capacity in TradeData_filter_masterfundamental is :")))
print(table(TradeData_filter_masterfundamental$capacity))
print(message(paste0("distribution of capacity in TradeData_filter_masterfundamental_allcapacity is :")))
print(table(TradeData_filter_masterfundamental_allcapacity$capacity))

###[2]We need to only use secondary market transactions, i.e. time_to_offering > 90 (we temporarily maintain those trades with time_to_offering as NA)
print(summary(TradeData_filter_masterfundamental$time_to_offering))

print(nrow(TradeData_filter_masterfundamental))

###within the remaining missing time_to_offering, some of them may be new issuance, but since it significantly 
###decreases from 600000, we keep them in the data. (This may need UPDATE LATER)
TradeData_filter_masterfundamental<-
  TradeData_filter_masterfundamental[time_to_offering>=90|is.na(time_to_offering)]

print(nrow(TradeData_filter_masterfundamental))

###Do the same for TradeData_filter_masterfundamental_allcapacity (although in TradeData_filter_masterfundamental, minimum time_to_offering is 125 days)
print(summary(TradeData_filter_masterfundamental_allcapacity$time_to_offering))

print(nrow(TradeData_filter_masterfundamental_allcapacity))

TradeData_filter_masterfundamental_allcapacity<-
  TradeData_filter_masterfundamental_allcapacity[time_to_offering>=90|is.na(time_to_offering)]

print(nrow(TradeData_filter_masterfundamental_allcapacity))

###CH measure

TradeData_filter_masterfundamental[dum_dealer_to_customer==1, customer_buy_indic:=1]
TradeData_filter_masterfundamental[dum_customer_to_dealer==1, customer_buy_indic:=-1]

#################################
# calcualte the reference price #
#################################
# For each customer trade, calculate its reference price as the 
# volume-weighted average price of interdealer trades larger than $100,000 
# in the same bond-day, excluding interdealer trades executed within 15 minutes.
# $100 = par value, and the volume traded (in dollars of par). 


# TradeData_filter_masterfundamental<-fread("INTERMEDIATESAVE_TradeData_filter_masterfundamental.csv")

# TMP_C <- TradeData_filter_masterfundamental[dum_interdealer==0,.(cusip_id,trd_exctn_dt_s,new_datetime_exctn,`Quantity(ParVal)_num`,rptd_pr,
#                                                                  ID)]
# TMP_D <- TradeData_filter_masterfundamental[dum_interdealer==1 & `Quantity(ParVal)_num` >100e3,
#                                             .(cusip_id,trd_exctn_dt_s,new_datetime_exctn, `Quantity(ParVal)_num` ,rptd_pr, ID)]

TMP_C <- TradeData_filter_masterfundamental[dum_interdealer==0,c("cusip_id","trd_exctn_dt","new_datetime_exctn","rptd_pr","Quantity(ParVal)_num","bond_grade",
                                                                 "ID",ExtraDum),
                                            with=FALSE]

TMP_D <- TradeData_filter_masterfundamental[dum_interdealer==1 & `Quantity(ParVal)_num` >100e3,
                                            c("cusip_id","trd_exctn_dt","new_datetime_exctn","rptd_pr","Quantity(ParVal)_num","bond_grade",
                                              "ID",ExtraDum),
                                            with=FALSE]

setnames(TMP_D,c("ID"),c("DD_ID"))
setnames(TMP_C,c("ID"),c("DC_ID"))

setnames(TMP_D,c("cusip_id", "trd_exctn_dt", "new_datetime_exctn", "Quantity(ParVal)_num","rptd_pr","bond_grade",ExtraDum),
         c(paste0(c("cusip_id", "trd_exctn_dt", "new_datetime_exctn", "Quantity(ParVal)_num","rptd_pr","bond_grade",ExtraDum),"_DD")))

TMP_M <- data.table(sqldf('select *
                          from TMP_C as a, TMP_D as b
                          where a.cusip_id=b.cusip_id_DD and a.trd_exctn_dt=b.trd_exctn_dt_DD and (a.new_datetime_exctn>=b.new_datetime_exctn_DD+900 or a.new_datetime_exctn<=b.new_datetime_exctn_DD-900)
                          order by a.cusip_id, a.trd_exctn_dt, a.new_datetime_exctn'))

# TMP_M <- data.table(sqldf('select *
#                           from TMP_D as a, TMP_C as b
#                           where a.cusip_id=b.cusip_id and a.trd_exctn_dt=b.trd_exctn_dt and (a.new_datetime_exctn>=b.new_datetime_exctn+900 or a.new_datetime_exctn<=b.new_datetime_exctn-900)
#                           order by a.cusip_id, a.trd_exctn_dt, a.new_datetime_exctn'))

nrow(TMP_M)
# TMP_M[,`:=`(count_seq=sequence(.N)),
#       by=c("DD_ID")]
TMP_M[,`:=`(count_seq=sequence(.N)),
      by=c("DC_ID")]
# TMP_M<-TMP_M[count_seq==1]
# nrow(TMP_M)

# TMP_M[,`:=`(new_datetime_exctn_DC=as.POSIXct(new_datetime_exctn..10, origin = "1970-01-01 00:00:00"))]

# TMP_M[,ref_pr:=weighted.mean(rptd_pr, w=`Quantity(ParVal)_num`), by=.(trd_exctn_dt,cusip_id)]

TMP_M[,ref_pr:=weighted.mean(rptd_pr_DD, w=`Quantity(ParVal)_num_DD`,na.rm = TRUE), by=.(DC_ID)]

rm(TMP_D)

# ref_prices <- unique(TMP_M[,.(trd_exctn_dt,cusip_id,ref_pr)])
ref_prices <- TMP_M[count_seq==1,c("ref_pr","DC_ID"),
                    with=FALSE]

setnames(ref_prices,c("DC_ID"),c("ID"))

print(nrow(TradeData_filter_masterfundamental))

TradeData_filter_masterfundamental<-
  left_join(TradeData_filter_masterfundamental,
            ref_prices,
            by=c("ID"))

print(nrow(TradeData_filter_masterfundamental))

###there are many transactions with no-reference prices.
print(paste0("calculated reference price is:"))
print(summary(TradeData_filter_masterfundamental$ref_pr))

#################################
# calcualte spread1 in Choi-Huh #
#################################
# All bid-ask spread measures are calculated using trades with par values of $1 million and above
# trade-level
TradeData_filter_masterfundamental<-as.data.table(TradeData_filter_masterfundamental)
TradeData_filter_masterfundamental[`Quantity(ParVal)_num`>= 1000000,
                                   spread1:=2*customer_buy_indic*(rptd_pr/ref_pr-1)]
# winsorized at the 1% level each day
TradeData_filter_masterfundamental[,
                                   `:=`(high1pt_spread1=quantile(spread1,probs = 0.99,
                                                                 na.rm = TRUE),
                                        low1pt_spread1=quantile(spread1,probs = 0.01,
                                                                na.rm = TRUE)),
                                   by=c("trd_exctn_dt_s")]

TradeData_filter_masterfundamental[spread1<low1pt_spread1,
                                   `:=`(spread1=low1pt_spread1)]
TradeData_filter_masterfundamental[spread1>high1pt_spread1,
                                   `:=`(spread1=high1pt_spread1)]

# bond-day-level
TradeData_filter_masterfundamental[,
                                   spread1_bondday:=weighted.mean(spread1, w = `Quantity(ParVal)_num`, na.rm = TRUE), by=.(trd_exctn_dt_s,cusip_id)]
# winsorized at the 1% level each day
TradeData_filter_masterfundamental[,
                                   `:=`(high1pt_spread1_bondday=quantile(spread1_bondday,probs = 0.99,
                                                                         na.rm = TRUE),
                                        low1pt_spread1_bondday=quantile(spread1_bondday,probs = 0.01,
                                                                        na.rm = TRUE)),
                                   by=c("trd_exctn_dt_s")]

TradeData_filter_masterfundamental[spread1_bondday<low1pt_spread1_bondday,
                                   `:=`(spread1_bondday=low1pt_spread1_bondday)]
TradeData_filter_masterfundamental[spread1_bondday>high1pt_spread1_bondday,
                                   `:=`(spread1_bondday=high1pt_spread1_bondday)]

###calcualte spread_CH
TradeData_filter_masterfundamental[,
                                   spread_CH:=2*customer_buy_indic*(rptd_pr/ref_pr-1)]

# bond-day-level
TradeData_filter_masterfundamental[,
                                   spread_CH_bondday:=weighted.mean(spread_CH, w = `Quantity(ParVal)_num`, 
                                                                    na.rm = TRUE), by=.(trd_exctn_dt_s,cusip_id)]
# winsorized at the 1% level each day
print(paste0("distribution of spread_CH_bondday for is: "))
print(summary(TradeData_filter_masterfundamental$spread_CH_bondday))
TradeData_filter_masterfundamental[,
                                   `:=`(high1pt_spread_CH_bondday=quantile(spread_CH_bondday,probs = 0.99,
                                                                           na.rm = TRUE),
                                        low1pt_spread_CH_bondday=quantile(spread_CH_bondday,probs = 0.01,
                                                                          na.rm = TRUE)),
                                   by=c("trd_exctn_dt_s")]

TradeData_filter_masterfundamental[spread_CH_bondday<low1pt_spread_CH_bondday,
                                   `:=`(spread_CH_bondday=low1pt_spread_CH_bondday)]
TradeData_filter_masterfundamental[spread_CH_bondday>high1pt_spread_CH_bondday,
                                   `:=`(spread_CH_bondday=high1pt_spread_CH_bondday)]

print(summary(TradeData_filter_masterfundamental$spread_CH_bondday))


#########################################
# calcualte same_day spread in Choi-Huh #
#########################################
TradeData_filter_masterfundamental[`Quantity(ParVal)_num`>= 1000000 & dum_dealer_to_customer==1,
                                   same_day_num1:=weighted.mean(rptd_pr, w = `Quantity(ParVal)_num`, na.rm = TRUE), by=.(trd_exctn_dt_s,cusip_id)]

TradeData_filter_masterfundamental[`Quantity(ParVal)_num`>= 1000000 & dum_customer_to_dealer==1,
                                   same_day_num2:=weighted.mean(rptd_pr, w = `Quantity(ParVal)_num`, na.rm = TRUE), by=.(trd_exctn_dt_s,cusip_id)]

TradeData_filter_masterfundamental[,
                                   same_day_num:=2*(unique(na.omit(same_day_num1)) - unique(na.omit(same_day_num2))), by=.(trd_exctn_dt_s,cusip_id)]
TradeData_filter_masterfundamental[,
                                   same_day_denom:=unique(na.omit(same_day_num1)) + unique(na.omit(same_day_num2)),by=.(trd_exctn_dt_s,cusip_id)]
TradeData_filter_masterfundamental[,
                                   same_day:=same_day_num/same_day_denom, by=.(trd_exctn_dt_s,cusip_id)]

# winsorized at the 1% level each day
TradeData_filter_masterfundamental[,
                                   `:=`(high1pt_same_day=quantile(same_day,probs = 0.99,
                                                                  na.rm = TRUE),
                                        low1pt_same_day=quantile(same_day,probs = 0.01,
                                                                 na.rm = TRUE)),
                                   by=c("trd_exctn_dt_s")]

TradeData_filter_masterfundamental[same_day<low1pt_same_day,
                                   `:=`(same_day=low1pt_same_day)]
TradeData_filter_masterfundamental[same_day>high1pt_same_day,
                                   `:=`(same_day=high1pt_same_day)]

###After calculating CH and sameday, we merge back the agency-transaction-part
library(lfe)
library(gtools)
nrow(TradeData_filter_masterfundamental)+nrow(TradeData_filter_masterfundamental_agency)
TradeData_filter_masterfundamental<-
  smartbind(TradeData_filter_masterfundamental,
            TradeData_filter_masterfundamental_agency)
nrow(TradeData_filter_masterfundamental)

TradeData_filter_masterfundamental<-as.data.table(TradeData_filter_masterfundamental)

###Bond_day level only
TradeData_filter_masterfundamental[,`:=`(bondday_totvol=sum(`Quantity(ParVal)_num`,na.rm = TRUE)),
                                   by=c("trd_exctn_dt_s","cusip_id")]

TradeData_filter_masterfundamental[,`:=`(bondday_totvol_customer=sum(`Quantity(ParVal)_num`*dum_customer,na.rm = TRUE)),
                                   by=c("trd_exctn_dt_s","cusip_id")]

TradeData_filter_masterfundamental[!is.na(spread1_bondday),`:=`(bondday_totvol_withnonnanspread1=sum(`Quantity(ParVal)_num`,na.rm = TRUE)),
                                   by=c("trd_exctn_dt_s","cusip_id")]

TradeData_filter_masterfundamental[!is.na(spread_CH_bondday),`:=`(count_seq=sequence(.N),
                                                                  bondday_totvol_withnonnanspreadCH=sum(`Quantity(ParVal)_num`,na.rm = TRUE)),
                                   by=c("trd_exctn_dt_s","cusip_id")]

TradeData_filter_masterfundamental[,`:=`(count_seq=sequence(.N)),
                                   by=c("cusip_id","mtrty_dt","trd_exctn_dt_s")]
TradeData_filter_masterfundamental[,`:=`(dum_IG=0,dum_HY=0,dum_NR=0)]
TradeData_filter_masterfundamental[bond_grade=="I",`:=`(dum_IG=1)]
TradeData_filter_masterfundamental[bond_grade=="H",`:=`(dum_HY=1)]
TradeData_filter_masterfundamental[bond_grade=="NR",`:=`(dum_NR=1)]

###calculate bond-level prop of agency trades (vol/num) and cumulative inventory
###using all capacity data "TradeData_filter_masterfundamental_allcapacity"
TradeData_filter_masterfundamental_allcapacity[,`:=`(count_seq=sequence(.N)),
                                               by=c("cusip_id","mtrty_dt","trd_exctn_dt_s")]
TradeData_filter_masterfundamental_allcapacity[,`:=`(dum_IG=0,dum_HY=0,dum_NR=0)]
TradeData_filter_masterfundamental_allcapacity[bond_grade=="I",`:=`(dum_IG=1)]
TradeData_filter_masterfundamental_allcapacity[bond_grade=="H",`:=`(dum_HY=1)]
TradeData_filter_masterfundamental_allcapacity[bond_grade=="NR",`:=`(dum_NR=1)]

###At bond level, vol are in $million
TradeData_filter_masterfundamental_allcapacity[,`:=`(daily_vol_alltrades = sum(`Quantity(ParVal)_num`,na.rm=TRUE)/1e6, 
                                                     daily_num_alltrades = sum(!is.na(`Quantity(ParVal)_num`)),
                                                     daily_vol_customer_bought = sum(dum_dealer_to_customer*`Quantity(ParVal)_num`,na.rm = TRUE)/1e6,
                                                     daily_num_customer_bought = sum(dum_dealer_to_customer,na.rm = TRUE),
                                                     daily_vol_customer_sold = sum(dum_customer_to_dealer*`Quantity(ParVal)_num`,na.rm = TRUE)/1e6,
                                                     daily_num_customer_sold = sum(dum_customer_to_dealer,na.rm = TRUE),
                                                     daily_vol_interdealer = sum(dum_interdealer*`Quantity(ParVal)_num`,na.rm = TRUE)/1e6,
                                                     daily_num_interdealer= sum(dum_interdealer,na.rm = TRUE)), 
                                               by=c("cusip_id","trd_exctn_dt_s")]

TradeData_filter_masterfundamental_allcapacity[,
                                               `:=`(daily_netinflow_vol=daily_vol_customer_sold-daily_vol_customer_bought)]
TradeData_filter_masterfundamental_allcapacity[,`:=`(dum_agencytrade=0,
                                                     dum_principaltrade=0)]
TradeData_filter_masterfundamental_allcapacity[capacity=="A",`:=`(dum_agencytrade=1)]
TradeData_filter_masterfundamental_allcapacity[capacity=="P",`:=`(dum_principaltrade=1)]


TradeData_filter_masterfundamental_allcapacity[,`:=`(daily_num_agencytrade = sum(dum_agencytrade,na.rm = TRUE),
                                                     daily_num_principaltrade = sum(dum_principaltrade,na.rm = TRUE),
                                                     daily_vol_agencytrade = sum(dum_agencytrade*`Quantity(ParVal)_num`,na.rm = TRUE)/1e6,
                                                     daily_vol_principaltrade = sum(dum_principaltrade*`Quantity(ParVal)_num`,na.rm = TRUE)/1e6),
                                               by=c("cusip_id","trd_exctn_dt_s")]

TradeData_filter_masterfundamental_allcapacity[,`:=`(prop_agency_num=daily_num_agencytrade/(daily_num_agencytrade+daily_num_principaltrade),
                                                     prop_agency_vol=daily_vol_agencytrade/(daily_vol_agencytrade+daily_vol_principaltrade))]

###cumulative inventory can only be calculated after dropping duplicate rows and combined with TRACE regular!!!
TradeData_filter_masterfundamental_allcapacity[
  ,
  `:=`(count_seq=sequence(.N)),
  by=c("cusip_id","trd_exctn_dt_s")
  ]

TradeData_filter_masterfundamental_allcapacity_bondday<-TradeData_filter_masterfundamental_allcapacity[count_seq==1]
TradeData_filter_masterfundamental_allcapacity_bondday[,`:=`(dum_IG=0,dum_HY=0,dum_NR=0)]
TradeData_filter_masterfundamental_allcapacity_bondday[bond_grade=="I",`:=`(dum_IG=1)]
TradeData_filter_masterfundamental_allcapacity_bondday[bond_grade=="H",`:=`(dum_HY=1)]
TradeData_filter_masterfundamental_allcapacity_bondday[bond_grade=="NR",`:=`(dum_NR=1)]

# TradeData_filter_masterfundamental_allcapacity_bondday<-
#   TradeData_filter_masterfundamental_allcapacity_bondday[order(cusip_id,trd_exctn_dt_s)]
# TradeData_filter_masterfundamental_allcapacity_bondday[,`:=`(cumulative_inven_change=cumsum(daily_netinflow_vol)),
#                                                by=c("cusip_id")]
# summary(TradeData_filter_masterfundamental_allcapacity_bondday$cumulative_inven_change)
summary(TradeData_filter_masterfundamental_allcapacity_bondday$daily_netinflow_vol)

###bond-level spread1,spreadCH, MIRC and sameday are from TradeData_filter_masterfundamental
TradeData_filter_masterfundamental_bondday<-TradeData_filter_masterfundamental[count_seq==1]
TradeData_filter_masterfundamental_bondday[,`:=`(dum_IG=0,dum_HY=0,dum_NR=0)]
TradeData_filter_masterfundamental_bondday[bond_grade=="I",`:=`(dum_IG=1)]
TradeData_filter_masterfundamental_bondday[bond_grade=="H",`:=`(dum_HY=1)]
TradeData_filter_masterfundamental_bondday[bond_grade=="NR",`:=`(dum_NR=1)]

TradeData_filter_masterfundamental_bondday_to_merge<-
  TradeData_filter_masterfundamental_bondday[,
                                             c("cusip_id","trd_exctn_dt_s",
                                               "MIRC_bondday","same_day","spread_CH_bondday","spread1_bondday"),
                                             with=FALSE]

TradeData_filter_masterfundamental_allcapacity_bondday[,
                                                       `:=`(trd_exctn_dt_s=as.character(trd_exctn_dt_s))]

TradeData_filter_masterfundamental_bondday_to_merge[,
                                                    `:=`(trd_exctn_dt_s=as.character(trd_exctn_dt_s))]

TradeData_filter_masterfundamental_allcapacity_bondday_withbondtransactioncost<-
  as.data.table(left_join(TradeData_filter_masterfundamental_allcapacity_bondday,
                          TradeData_filter_masterfundamental_bondday_to_merge,
                          by=c("cusip_id","trd_exctn_dt_s")))

###Need to make sure colnames are the same
###setwd("~/Corporate_Bond_TRACE/data/TRACE_EOD")
###Read generated transaction-level data on standard TRACE, and make sure EOD transaction-level data has
###same fields with the standard-version one. 
Reg_Data_transactionwise_EOD<-fread(paste0(current_path,"Reg_Data_transactionwise_EOD_11072020.csv"))
Reg_Data_bondwise_EOD<-fread(paste0(current_path,"Reg_Data_bondwise_EOD_11072020.csv"))

###setwd("~/Corporate_Bond_TRACE/data/TRACE_standard")

Var1<-c(colnames(TradeData_filter_masterfundamental)[grep("issuer_nm",
                                                          c(colnames(TradeData_filter_masterfundamental))):
                                                       grep("mean_grade",
                                                            c(colnames(TradeData_filter_masterfundamental)))])
Var2<-c(colnames(TradeData_filter_masterfundamental)[grep("issue_id.x",
                                                          c(colnames(TradeData_filter_masterfundamental))):
                                                       grep("naics_code",
                                                            c(colnames(TradeData_filter_masterfundamental)))])
Var3<-c(colnames(TradeData_filter_masterfundamental)[grep("dum_interdealer",
                                                          c(colnames(TradeData_filter_masterfundamental))):
                                                       grep("dum_NR",
                                                            c(colnames(TradeData_filter_masterfundamental)))])

Var4<-c(colnames(TradeData_filter_masterfundamental_allcapacity_bondday_withbondtransactioncost)[grep("dum_TTM5yless",
                                                                                                      c(colnames(TradeData_filter_masterfundamental_allcapacity_bondday_withbondtransactioncost))):
                                                                                                   grep("spread1_bondday",
                                                                                                        c(colnames(TradeData_filter_masterfundamental_allcapacity_bondday_withbondtransactioncost)))])
Reg_Data_transactionwise<-
  TradeData_filter_masterfundamental[
    ,
    c("cusip_id","company_symbol","bond_sym_id","trd_exctn_dt_s","new_datetime_exctn","time_to_maturity","time_to_offering",
      "credit_rating_code_RD",
      "amountoutstanding_thousands","offering_amt","bsym","sub_prd_type","rptd_pr","dum_Quantity(ParVal)_5MM",
      "dum_Quantity(ParVal)_1MM","Quantity(ParVal)_num",Var1,Var2,Var3,
      "issuername", "couponrate",  "ustreasuryyield",                  
      "bondtype",  "industrygroup",  "industrysubgroup",                 
      "coupontype", "moodysrating","standardpoorsrating"),
    with=FALSE
    ]
Reg_Data_bondwise<-
  TradeData_filter_masterfundamental_allcapacity_bondday_withbondtransactioncost[
    ,
    c("cusip_id","company_symbol","bond_sym_id","trd_exctn_dt_s","time_to_maturity","time_to_offering", "credit_rating_code_RD",
      "amountoutstanding_thousands","offering_amt","bsym","sub_prd_type",Var4),
    with=FALSE
    ]
Reg_Data_bondwise[,`:=`(MIRC_bondday.x=NULL)]
setnames(Reg_Data_bondwise,c("MIRC_bondday.y"),c("MIRC_bondday"))

###truncate ins_amount_outstanding ($1,000)
ql_ins_amt_out<-
  quantile(Reg_Data_transactionwise$amountoutstanding_thousands,probs = 0.005,na.rm = TRUE)
Reg_Data_transactionwise[
  (amountoutstanding_thousands<ql_ins_amt_out),
  `:=`(amountoutstanding_thousands=ql_ins_amt_out)
  ]
Reg_Data_bondwise[
  (amountoutstanding_thousands<ql_ins_amt_out),
  `:=`(amountoutstanding_thousands=ql_ins_amt_out)
  ]
min(Reg_Data_bondwise$trd_exctn_dt_s)
max(Reg_Data_bondwise$trd_exctn_dt_s)
min(Reg_Data_transactionwise$trd_exctn_dt_s)
max(Reg_Data_transactionwise$trd_exctn_dt_s)

###for any variable that with ".x", delete the ".x".
Oldnames_Reg_Data_transactionwise<-
  c("bond_sym_id","new_datetime_exctn","time_to_maturity","time_to_offering","amountoutstanding_thousands","sub_prd_type",
    "dum_Quantity(ParVal)_5MM","dum_Quantity(ParVal)_1MM","Quantity(ParVal)_num","dum_interdealer","dum_customer","dum_dealer_to_customer",
    "dum_customer_to_dealer","time_to_maturity_yr","category_trade_size",
    "issue_id.x",
    "issuer_id.x",
    "prospectus_issuer_name.x","issue_cusip.x","issue_name.x")
Newnames_Reg_Data_transactionwise<-
  c("symbol","datetime_exctn","TTM","TTO","Amt_Out","sub_product",
    "dum_5MM","dum_1MM","entrd_vol_qt","dum_DD","dum_C","dum_D2C",
    "dum_C2D","TTM_yr","category_size",
    "issue_id",
    "issuer_id",
    "prospectus_issuer_name","issue_cusip","issue_name")

setnames(Reg_Data_transactionwise,
         c(Oldnames_Reg_Data_transactionwise),
         c(Newnames_Reg_Data_transactionwise))

###drop unmatched columns
Reg_Data_transactionwise_EOD[
  ,
  `:=`(mtrty_dt=NULL,num_grade=NULL,country_domicile_2=NULL)
]
print(setdiff(c(colnames(Reg_Data_transactionwise_EOD)),
              c(colnames(Reg_Data_transactionwise))))
print(setdiff(c(colnames(Reg_Data_transactionwise)),
              c(colnames(Reg_Data_transactionwise_EOD))))

# fwrite(Reg_Data_transactionwise,
#        "Reg_Data_transactionwise_regular.csv")
fwrite(Reg_Data_transactionwise,
       "Reg_Data_transactionwise_regular_11072020.csv")

Oldnames_Reg_Data_bondwise<-
  c("bond_sym_id","time_to_maturity","time_to_offering","amountoutstanding_thousands","sub_prd_type",
    "time_to_maturity_yr","daily_vol_alltrades","daily_num_alltrades",      
    "daily_vol_customer_bought", "daily_num_customer_bought", "daily_vol_customer_sold",   "daily_num_customer_sold",  
    "daily_vol_interdealer", "daily_num_interdealer",     "daily_netinflow_vol",  "daily_num_agencytrade","daily_num_principaltrade","daily_vol_agencytrade",   
    "daily_vol_principaltrade")
Newnames_Reg_Data_bondwise<-
  c("symbol","TTM","TTO","Amt_Out","sub_product",
    "TTM_yr","day_vol","day_num",      
    "day_vol_D2C", "day_num_D2C", "day_vol_C2D",   "day_num_C2D",  
    "day_vol_DD", "day_num_DD",     "day_netinflow", "day_num_agency","day_num_principal","day_vol_agency",   
    "day_vol_principal")
setnames(Reg_Data_bondwise,
         c(Oldnames_Reg_Data_bondwise),
         c(Newnames_Reg_Data_bondwise))

# setnames(Reg_Data_bondwise_EOD,
#          c("country_domicile_2"),
#          c("country_domicile"))

# fwrite(Reg_Data_bondwise,
#        "Reg_Data_bondwise_regular.csv")
fwrite(Reg_Data_bondwise,
       "Reg_Data_bondwise_regular_11072020.csv")

############################################################
###combine standard and EOD TRACE transaction-level data. 
############################################################

setwd("~/Corporate_Bond_TRACE/data/TRACE_bondlevel_analysis")
max(Reg_Data_transactionwise$trd_exctn_dt_s)
library(gtools)
Combine_Reg_Data_transactionwise_regular_EOD<-
  smartbind(Reg_Data_transactionwise,
        Reg_Data_transactionwise_EOD)
nrow(Reg_Data_transactionwise)+nrow(Reg_Data_transactionwise_EOD)
nrow(Combine_Reg_Data_transactionwise_regular_EOD)

Combine_Reg_Data_transactionwise_regular_EOD<-
  as.data.table(Combine_Reg_Data_transactionwise_regular_EOD)

for (colname in names(Combine_Reg_Data_transactionwise_regular_EOD)) {
  if (is.character(Combine_Reg_Data_transactionwise_regular_EOD[[colname]])) {
    Combine_Reg_Data_transactionwise_regular_EOD[[colname]] <- as.factor(Combine_Reg_Data_transactionwise_regular_EOD[[colname]])
  }
}

write_dta(Combine_Reg_Data_transactionwise_regular_EOD,
          "Combine_Reg_Data_transactionwise_regular_EOD.dta")
# fwrite(Combine_Reg_Data_transactionwise_regular_EOD,
#        "Combine_Reg_Data_transactionwise_regular_EOD.csv")
fwrite(Combine_Reg_Data_transactionwise_regular_EOD,
       "Combine_Reg_Data_transactionwise_regular_EOD_11072020.csv")

print(setdiff(c(colnames(Reg_Data_bondwise_EOD)),
              c(colnames(Reg_Data_bondwise))))
print(setdiff(c(colnames(Reg_Data_bondwise)),
              c(colnames(Reg_Data_bondwise_EOD))))

max(Reg_Data_bondwise$trd_exctn_dt_s)
Combine_Reg_Data_bondwise_regular_EOD<-
  smartbind(Reg_Data_bondwise,
            Reg_Data_bondwise_EOD)
nrow(Reg_Data_bondwise)+nrow(Reg_Data_bondwise_EOD)
nrow(Combine_Reg_Data_bondwise_regular_EOD)

Combine_Reg_Data_bondwise_regular_EOD<-
  as.data.table(Combine_Reg_Data_bondwise_regular_EOD)

for (colname in names(Combine_Reg_Data_bondwise_regular_EOD)) {
  if (is.character(Combine_Reg_Data_bondwise_regular_EOD[[colname]])) {
    Combine_Reg_Data_bondwise_regular_EOD[[colname]] <- as.factor(Combine_Reg_Data_bondwise_regular_EOD[[colname]])
  }
}

###cumulative inventory
Combine_Reg_Data_bondwise_regular_EOD<-
  Combine_Reg_Data_bondwise_regular_EOD[order(cusip_id,trd_exctn_dt_s)]
summary(Combine_Reg_Data_bondwise_regular_EOD$day_netinflow)
Combine_Reg_Data_bondwise_regular_EOD[
  ,
  `:=`(cuminven=cumsum(day_netinflow)),
  by=c("cusip_id")
]
summary(Combine_Reg_Data_bondwise_regular_EOD$cuminven)

write_dta(Combine_Reg_Data_bondwise_regular_EOD,
          "Combine_Reg_Data_bondwise_regular_EOD.dta")
# fwrite(Combine_Reg_Data_bondwise_regular_EOD,
#        "Combine_Reg_Data_bondwise_regular_EOD.csv")
fwrite(Combine_Reg_Data_bondwise_regular_EOD,
       "Combine_Reg_Data_bondwise_regular_EOD_11072020.csv")

Combine_Reg_Data_bondwise_regular_EOD<-
  as.data.table(Combine_Reg_Data_bondwise_regular_EOD)
Combine_Reg_Data_transactionwise_regular_EOD<-
  as.data.table(Combine_Reg_Data_transactionwise_regular_EOD)
table(Combine_Reg_Data_bondwise_regular_EOD[dum_crisis==1,
                                   c("trd_exctn_dt_s"),with=FALSE])
table(Combine_Reg_Data_transactionwise_regular_EOD[dum_crisis==1,
                                   c("trd_exctn_dt_s"),with=FALSE])
table(Combine_Reg_Data_bondwise_regular_EOD[dum_intervention==1,
                                            c("trd_exctn_dt_s"),with=FALSE])
table(Combine_Reg_Data_transactionwise_regular_EOD[dum_intervention==1,
                                                   c("trd_exctn_dt_s"),with=FALSE])

summary(Combine_Reg_Data_transactionwise_regular_EOD$MIRC)
summary(Combine_Reg_Data_transactionwise_regular_EOD$MIRC_bondday)
summary(Combine_Reg_Data_bondwise_regular_EOD$MIRC)
summary(Combine_Reg_Data_bondwise_regular_EOD$MIRC_bondday)

############################################
############################################
###Drop holidays and weekends
nrow(Combine_Reg_Data_transactionwise_regular_EOD)
Combine_Reg_Data_transactionwise_regular_EOD[,`:=`(weekday=weekdays(as.Date(trd_exctn_dt_s)))]
Combine_Reg_Data_transactionwise_regular_EOD<-Combine_Reg_Data_transactionwise_regular_EOD[!(weekday=="Saturday"|weekday=="Sunday")]

Combine_Reg_Data_transactionwise_regular_EOD<-Combine_Reg_Data_transactionwise_regular_EOD[!(trd_exctn_dt_s %in% c("2020-01-04","2020-01-05","2020-01-11","2020-01-12","2020-01-18","2020-01-19","2020-01-25","2020-01-26",
                               "2020-02-01","2020-02-02","2020-02-08","2020-02-09","2020-02-15","2020-02-16","2020-02-22","2020-02-23",
                               "2020-02-29","2020-03-01","2020-03-07","2020-03-08","2020-03-14","2020-03-15","2020-03-21","2020-03-22",
                               "2020-03-28","2020-03-29",
                               "2020-01-20","2020-02-17",
                               "2020-04-10","2020-05-25"))]
nrow(Combine_Reg_Data_transactionwise_regular_EOD)

nrow(Combine_Reg_Data_bondwise_regular_EOD)
Combine_Reg_Data_bondwise_regular_EOD[,`:=`(weekday=weekdays(as.Date(trd_exctn_dt_s)))]
Combine_Reg_Data_bondwise_regular_EOD<-Combine_Reg_Data_bondwise_regular_EOD[!(weekday=="Saturday"|weekday=="Sunday")]

Combine_Reg_Data_bondwise_regular_EOD<-Combine_Reg_Data_bondwise_regular_EOD[!(trd_exctn_dt_s %in% c("2020-01-04","2020-01-05","2020-01-11","2020-01-12","2020-01-18","2020-01-19","2020-01-25","2020-01-26",
                                 "2020-02-01","2020-02-02","2020-02-08","2020-02-09","2020-02-15","2020-02-16","2020-02-22","2020-02-23",
                                 "2020-02-29","2020-03-01","2020-03-07","2020-03-08","2020-03-14","2020-03-15","2020-03-21","2020-03-22",
                                 "2020-03-28","2020-03-29",
                                 "2020-01-20","2020-02-17",
                                 "2020-04-10","2020-05-25"))]
nrow(Combine_Reg_Data_bondwise_regular_EOD)

table(Combine_Reg_Data_bondwise_regular_EOD[dum_crisis==1,
                                            c("trd_exctn_dt_s"),with=FALSE])
table(Combine_Reg_Data_transactionwise_regular_EOD[dum_crisis==1,
                                                   c("trd_exctn_dt_s"),with=FALSE])
table(Combine_Reg_Data_bondwise_regular_EOD[dum_intervention==1,
                                            c("trd_exctn_dt_s"),with=FALSE])
table(Combine_Reg_Data_transactionwise_regular_EOD[dum_intervention==1,
                                                   c("trd_exctn_dt_s"),with=FALSE])


# fwrite(Combine_Reg_Data_bondwise_regular_EOD,
#        "Combine_Reg_Data_bondwise_regular_EOD.csv")
fwrite(Combine_Reg_Data_bondwise_regular_EOD,
       "Combine_Reg_Data_bondwise_regular_EOD_11072020.csv")
write_dta(Combine_Reg_Data_bondwise_regular_EOD,
          "Combine_Reg_Data_bondwise_regular_EOD.dta")
# fwrite(Combine_Reg_Data_transactionwise_regular_EOD,
#        "Combine_Reg_Data_transactionwise_regular_EOD.csv")
fwrite(Combine_Reg_Data_transactionwise_regular_EOD,
       "Combine_Reg_Data_transactionwise_regular_EOD_11072020.csv")
write_dta(Combine_Reg_Data_transactionwise_regular_EOD,
          "Combine_Reg_Data_transactionwise_regular_EOD.dta")


#############################################################
###Merge with new Fallen angels dummies
###(Note: this step is optional, if not merging with any angels dummies,
###we can directly put "Combine_Reg_Data_transactionwise_regular_EOD.dta" as input 
###for empirical analysis in "Replicating_empirical_results.R")

###setwd("~/Corporate_Bond_TRACE/data/TRACE_standard/Fallen_Angels")
All_List_Fallen_Angel_to_merge_TRACE<-
  fread(paste0(current_path,"All_List_Fallen_Angel_to_merge_TRACE.csv"))
All_List_Fallen_Angel_to_merge_TRACE<-
  All_List_Fallen_Angel_to_merge_TRACE[,c("cusip","bond_type","date_IG_to_HY","issuername"),with=FALSE]
setnames(All_List_Fallen_Angel_to_merge_TRACE,
         c(colnames(All_List_Fallen_Angel_to_merge_TRACE)),
         c("cusip_id",c(paste0("FA_",c("bond_type","date_IG_to_HY","issuername")))))

nrow(All_List_Fallen_Angel_to_merge_TRACE)
nrow(unique(All_List_Fallen_Angel_to_merge_TRACE[,c("cusip_id"),with=FALSE]))

setwd("~/Corporate_Bond_TRACE/data/TRACE_bondlevel_analysis")

Combine_Reg_Data_transactionwise_regular_EOD_creditrating_RD<-
  Combine_Reg_Data_transactionwise_regular_EOD
nrow(Combine_Reg_Data_transactionwise_regular_EOD_creditrating_RD)
Combine_Reg_Data_transactionwise_regular_EOD_creditrating_RD_FA<-
  as.data.table(left_join(Combine_Reg_Data_transactionwise_regular_EOD_creditrating_RD,
                          All_List_Fallen_Angel_to_merge_TRACE,
                          by=c("cusip_id")))
nrow(Combine_Reg_Data_transactionwise_regular_EOD_creditrating_RD_FA)
nrow(Combine_Reg_Data_transactionwise_regular_EOD_creditrating_RD_FA[!is.na(FA_bond_type)])

Combine_Reg_Data_transactionwise_regular_EOD_creditrating_RD_FA[
  ,
  `:=`(dum_FA=0)
]
Combine_Reg_Data_transactionwise_regular_EOD_creditrating_RD_FA[
  !is.na(FA_bond_type),
  `:=`(dum_FA=1)
]
table(Combine_Reg_Data_transactionwise_regular_EOD_creditrating_RD_FA$dum_FA)
# fwrite(Combine_Reg_Data_transactionwise_regular_EOD_creditrating_RD_FA,
#        "Combine_Reg_Data_transactionwise_regular_EOD_creditrating_RD_FA.csv")

fwrite(Combine_Reg_Data_transactionwise_regular_EOD_creditrating_RD_FA_11072020,
       "Combine_Reg_Data_transactionwise_regular_EOD_creditrating_RD_FA_11072020.csv")







